# -*- coding: utf-8 -*-
import pandas as pd
import jieba
from settings import path
import numpy as np

file_train = path.path_train_txt  # training set
file_test = path.path_test_txt  # test set


def cut_sentence(s):
    # Segment the sentence with jieba and keep only the first 30 tokens, space-joined.
    return ' '.join(list(jieba.cut(s))[:30])
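# Example usage (hedged: the exact token boundaries depend on jieba's
# bundled dictionary and version, so the split shown here is illustrative):
#   cut_sentence('自然语言处理很有趣')  ->  '自然语言 处理 很 有趣'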


# Word-segmentation preprocessing for the training set
def cut_word():
    data = pd.read_csv(file_train, names=['text', 'label'], sep='\t', encoding='utf-8')
    data['word'] = data['text'].apply(cut_sentence)  # .apply() is likely faster than the map() variant below
    # data['word'] = list(map(cut_sentence, data['text']))
    # print(data.head(10))
    data.to_csv(path.path_train_processed_txt, index=False)
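

# Hedged sketch (an added helper, not part of the original pipeline): reading
# the processed file back. Despite the .txt suffix, cut_word() writes
# comma-separated CSV with a header row, since to_csv defaults to sep=','.
def load_processed():
    return pd.read_csv(path.path_train_processed_txt, encoding='utf-8')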


# Clean the test set
def data_test_processing():
    # Skip the first column (labels) and load only the two text columns.
    # comments=None stops np.loadtxt from truncating lines at '#' characters.
    data = np.loadtxt(file_test, dtype=str, usecols=(1, 2), delimiter='\t', comments=None, encoding='utf-8')
    np.savetxt(path.path_test1_txt, data, fmt='%s', delimiter='\t', encoding='utf-8')
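

# Hedged alternative sketch using pandas instead of numpy; it assumes the same
# three-column, tab-separated layout. data_test_processing_pd is a hypothetical
# helper added for illustration, not part of the original pipeline.
def data_test_processing_pd():
    df = pd.read_csv(file_test, sep='\t', header=None, usecols=[1, 2], encoding='utf-8')
    df.to_csv(path.path_test1_txt, sep='\t', header=False, index=False, encoding='utf-8')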


if __name__ == '__main__':
    data_test_processing()
